
###########################
## 07_icr_checks.R #######
#########################

# setup
library(pacman)
p_load(textreadr, dplyr, stringr, tidyr, haven, tokenizers, here, readxl, 
       magrittr, stringdist, ggplot2, tidyverse, icr)
i_am("Code/07_icr_checks.R")


## 1. initial recodes ####

# List the transcript files (any file name containing "docx") in the
# "ICR Transcripts" folder and number them in the order list.files() returns.
transcriptnames <- data.frame(
  names = list.files(pattern = "docx", path = here("ICR Transcripts")),
  stringsAsFactors = FALSE
)
transcriptnames <- cbind(index = seq_len(nrow(transcriptnames)), transcriptnames)

# Transcript number = file name with ASCII letters, hyphens, spaces, dots and
# underscores stripped out (the result is kept as text).
transcriptnames$transcriptnumbers <- str_remove_all(transcriptnames$names, "[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM-]")
transcriptnames$transcriptnumbers <- str_remove_all(transcriptnames$transcriptnumbers, " |\\.|\\_")

# manually add coder identifiers
# The letters are matched to files purely by position, so they are only valid
# while the folder holds exactly these files in the same listing order.
coder_ids <- c("G", "K", "H", "R", "J",
               "N", "H", "R", "J", "G",
               "B", "K", "N", "H", "R",
               "J", "G", "K", "H", "R",
               "J", "G", "B", "K", "R")
if (length(coder_ids) != nrow(transcriptnames)) {
  stop("Found ", nrow(transcriptnames), " transcript files in 'ICR Transcripts' ",
       "but the manual coder list has ", length(coder_ids), " entries.",
       call. = FALSE)
}
transcriptnames$coder <- coder_ids

# read all the docs, format with 1 line of speech per row
# Each transcript is parsed into its own data frame and the pieces are stacked
# once at the end (no placeholder row, no repeated copying inside the loop).
transcript_docs <- vector("list", nrow(transcriptnames))

for (i in seq_len(nrow(transcriptnames))) {
  # grab doc: read_docx() output is treated as one speaking turn per element
  jury <- textreadr::read_docx(here("ICR Transcripts", transcriptnames$names[i]))
  DFjury <- data.frame(jury = as.character(jury), stringsAsFactors = FALSE)

  # split at the first colon: speaker label before it, everything else
  # (including any later colons) is the spoken text
  DFjury_separated <- DFjury %>%
    separate(jury, c("juror", "text"), sep = ":", extra = "merge")

  # normalise typographic apostrophes and ellipses in the spoken text
  DFjury_separated[, 2] <- str_replace_all(DFjury_separated[, 2], "[’]", "'")
  DFjury_separated[, 2] <- str_replace_all(DFjury_separated[, 2], "[…]", "...")

  # clean up juror names: drop spaces, force upper case
  DFjury_separated$juror <- str_remove_all(DFjury_separated$juror, " ")
  DFjury_separated$juror <- toupper(DFjury_separated$juror)

  # add indicators: which transcript the turn came from and who coded it
  DFjury_separated$transcript <- transcriptnames$transcriptnumbers[i]
  DFjury_separated$coder <- transcriptnames$coder[i]

  transcript_docs[[i]] <- DFjury_separated
}

# columns: juror, text, transcript, coder
docs <- bind_rows(transcript_docs)

# Unique ID for each coder/speaker/transcript combination
docs$id <- paste(docs$juror, docs$transcript, docs$coder)

# Flag rows that are not real speakers: rows without any text, and rows whose
# "speaker" label is a transcript annotation rather than a person.
docs$filter <- is.na(docs$text) |
  grepl("WEEKEND|TAPE|SCENARIO|TRANSCRIBER|ENVELOPE|TRANSCRIBED", docs$juror)

# Keep only the unflagged rows (the `filter` column itself stays in `docs`)
docs <- docs %>%
  filter(!filter)

### paste all speech:
# reshape: 1 line per coder/speaker/transcript combo, with all of that
# speaker's turns pasted into a single string.
# `id` is built from juror, transcript and coder, so transcript and coder are
# constant within a group; taking the first value is deterministic and avoids
# sample(), which draws from 1:x when handed a single number.
people <- docs %>%
  group_by(id) %>%
  summarize(text = paste(text, collapse = " "),
            transcript = transcript[1],
            coder = coder[1])

## 2. manual recodes to remove speech from non-jurors and locate similar speech profiles ####
  # transcript 104314 ####

df104314 <- people %>% filter(transcript == 104314)
# take out non-main speakers
# NOTE(review): these are hand-picked row positions, so they depend on the
# current row order of `people` for this transcript.
df104314 <- df104314[-c(1:4, 11:19, 26:33, 58:59),]


# Similarity between every pair of speech profiles from *different* coders.
# Cells start as NA; same-coder pairs and the diagonal are never filled in.
comp <- matrix(nrow = nrow(df104314), ncol = nrow(df104314))
for (i in seq_len(nrow(df104314))) {
  for (j in seq_len(nrow(df104314))) {
    # compute each unordered pair once (i < j) and mirror it across the diagonal
    if (i < j && df104314$coder[i] != df104314$coder[j]) {
      comp[i, j] <- stringsim(df104314$text[i], df104314$text[j])
      comp[j, i] <- comp[i, j]
    }
  }
  # print progress (speeds up as it goes)
  print(paste("finished row", i))
}
comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

# note total profile length (characters) and row numbers
df104314$length <- nchar(df104314$text)
comp$rownum <- seq_len(nrow(df104314))

# dev = mean similarity between profile i and the (up to) 5 other-coder
# profiles it is most similar to; order() puts the NA cells last.
df104314$dev <- NA
for (i in seq_len(nrow(df104314))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df104314$dev[i] <- mean(comp[most, paste0("V", i)], na.rm = TRUE)
}


  # transcript 1642 ####

df1642 <- people %>% filter(transcript == 1642)
# take out non-main speakers (hand-picked row positions)
df1642 <- df1642[-c(16:24, 40),]

# Similarity between every pair of speech profiles from different coders.
# Cells start as NA; same-coder pairs and the diagonal are never filled in.
comp <- matrix(nrow = nrow(df1642), ncol = nrow(df1642))
for (i in seq_len(nrow(df1642))) {
  for (j in seq_len(nrow(df1642))) {
    # compute each unordered pair once (i < j) and mirror it across the diagonal
    if (i < j && df1642$coder[i] != df1642$coder[j]) {
      comp[i, j] <- stringsim(df1642$text[i], df1642$text[j])
      comp[j, i] <- comp[i, j]
    }
  }
  print(paste("finished row", i))
}
comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

df1642$length <- nchar(df1642$text)
comp$rownum <- seq_len(nrow(df1642))

# dev     = median similarity to the (up to) 4 most similar other-coder profiles
# devprop = dev divided by the profile's length in characters
# NOTE(review): transcript 104314 uses mean() over the top 5 instead of the
# median over the top 4 used here -- confirm the difference is intentional.
df1642$dev <- NA
df1642$devprop <- NA
for (i in seq_len(nrow(df1642))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:4]
  df1642$dev[i] <- median(comp[most, paste0("V", i)], na.rm = TRUE)
  df1642$devprop[i] <- df1642$dev[i] / df1642$length[i]
}

  # transcript 17226 ####
df17226 <- people %>% filter(transcript == 17226)
# take out non-main speakers (hand-picked row positions)
df17226 <- df17226[-c(5, 12, 14:25, 36, 39:44, 62:68),]

# Similarity between every pair of speech profiles from different coders.
# Cells start as NA; same-coder pairs and the diagonal are never filled in.
comp <- matrix(nrow = nrow(df17226), ncol = nrow(df17226))
for (i in seq_len(nrow(df17226))) {
  for (j in seq_len(nrow(df17226))) {
    # compute each unordered pair once (i < j) and mirror it across the diagonal
    if (i < j && df17226$coder[i] != df17226$coder[j]) {
      comp[i, j] <- stringsim(df17226$text[i], df17226$text[j])
      comp[j, i] <- comp[i, j]
    }
  }
  print(paste("finished row", i))
}
comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

df17226$length <- nchar(df17226$text)
comp$rownum <- seq_len(nrow(df17226))

# dev     = median similarity to the (up to) 5 most similar other-coder profiles
# devprop = dev divided by the profile's length in characters
df17226$dev <- NA
df17226$devprop <- NA
for (i in seq_len(nrow(df17226))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df17226$dev[i] <- median(comp[most, paste0("V", i)], na.rm = TRUE)
  df17226$devprop[i] <- df17226$dev[i] / df17226$length[i]
}

  # transcript 19327 ####
df19327 <- people %>% filter(transcript == 19327)
# take out non-main speakers (hand-picked row positions)
df19327 <- df19327[-c(13:17, 30:35),]

# Similarity between every pair of speech profiles from different coders.
# Cells start as NA; same-coder pairs and the diagonal are never filled in.
comp <- matrix(nrow = nrow(df19327), ncol = nrow(df19327))
for (i in seq_len(nrow(df19327))) {
  for (j in seq_len(nrow(df19327))) {
    # compute each unordered pair once (i < j) and mirror it across the diagonal
    if (i < j && df19327$coder[i] != df19327$coder[j]) {
      comp[i, j] <- stringsim(df19327$text[i], df19327$text[j])
      comp[j, i] <- comp[i, j]
    }
  }
  print(paste("finished row", i))
}
comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

df19327$length <- nchar(df19327$text)
comp$rownum <- seq_len(nrow(df19327))

# dev     = median similarity to the (up to) 5 most similar other-coder profiles
# devprop = dev divided by the profile's length in characters
df19327$dev <- NA
df19327$devprop <- NA
for (i in seq_len(nrow(df19327))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df19327$dev[i] <- median(comp[most, paste0("V", i)], na.rm = TRUE)
  df19327$devprop[i] <- df19327$dev[i] / df19327$length[i]
}



## 3. combine results ####

# Stack the four per-transcript tables; columns missing from a table
# (e.g. devprop for 104314) are filled with NA by bind_rows().
comb <- bind_rows(list(df104314, df1642, df17226, df19327))



## 4. repeat by speech turns instead of characters ####

# One row per coder/speaker/transcript id; `text` is now a list column holding
# that speaker's individual turns rather than one pasted string.
# transcript and coder are constant within an id, so the first value is taken
# (deterministic, unlike sample()).
people <- docs %>%
    group_by(id) %>%
    summarize(text = list(text),
              transcript = transcript[1],
              coder = coder[1])

  # transcript 104314 ####

df104314 <- people %>% filter(transcript == 104314)
# take out non-main speakers (hand-picked row positions)
df104314 <- df104314[-c(1:4, 11:19, 26:33, 58:59),]

# Turn-level overlap between every pair of profiles from different coders:
# 1 - (turns found in only one of the two profiles) / (total turns in both).
# Cells start as NA; same-coder pairs (including the diagonal) stay NA.
comp <- matrix(nrow = nrow(df104314), ncol = nrow(df104314))
for (i in seq_len(nrow(df104314))) {
  for (j in seq_len(nrow(df104314))) {
    if (df104314$coder[i] != df104314$coder[j]) {
      in_i_not_j <- sum(!(df104314$text[[i]] %in% df104314$text[[j]]))
      in_j_not_i <- sum(!(df104314$text[[j]] %in% df104314$text[[i]]))
      n <- length(df104314$text[[i]]) + length(df104314$text[[j]])
      comp[i, j] <- 1 - ((in_i_not_j + in_j_not_i) / n)
    }
  }
}

comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

# record length and number
# `text` is a list column here, so the character count is taken over the
# turns pasted with single spaces (same definition as the pasted-text pass);
# nchar() applied to the list itself would count the deparsed c("...") form.
df104314$length <- vapply(df104314$text,
                          function(turns) nchar(paste(turns, collapse = " ")),
                          integer(1))
comp$rownum <- seq_len(nrow(df104314))

# dev  = mean overlap with the (up to) 5 closest other-coder profiles
# most = row numbers of those closest profiles
df104314$dev <- NA
df104314$most <- vector("list", nrow(df104314))
for (i in seq_len(nrow(df104314))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df104314$dev[i] <- mean(comp[most, paste0("V", i)], na.rm = TRUE)
  df104314$most[i] <- list(most)
}

  # transcript 1642 ####

df1642 <- people %>% filter(transcript == 1642)
# take out non-main speakers (hand-picked row positions)
df1642 <- df1642[-c(16:24, 40),]

# Turn-level overlap between every pair of profiles from different coders:
# 1 - (turns found in only one of the two profiles) / (total turns in both).
# Cells start as NA; same-coder pairs (including the diagonal) stay NA.
comp <- matrix(nrow = nrow(df1642), ncol = nrow(df1642))
for (i in seq_len(nrow(df1642))) {
  for (j in seq_len(nrow(df1642))) {
    if (df1642$coder[i] != df1642$coder[j]) {
      in_i_not_j <- sum(!(df1642$text[[i]] %in% df1642$text[[j]]))
      in_j_not_i <- sum(!(df1642$text[[j]] %in% df1642$text[[i]]))
      n <- length(df1642$text[[i]]) + length(df1642$text[[j]])
      comp[i, j] <- 1 - ((in_i_not_j + in_j_not_i) / n)
    }
  }
}

comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

# `text` is a list column, so count characters over the turns pasted with
# single spaces; nchar() on the list itself would count the deparsed form.
df1642$length <- vapply(df1642$text,
                        function(turns) nchar(paste(turns, collapse = " ")),
                        integer(1))
comp$rownum <- seq_len(nrow(df1642))

# dev  = mean overlap with the (up to) 4 closest other-coder profiles
# most = row numbers of those closest profiles
df1642$dev <- NA
df1642$most <- vector("list", nrow(df1642))
for (i in seq_len(nrow(df1642))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:4]
  df1642$dev[i] <- mean(comp[most, paste0("V", i)], na.rm = TRUE)
  df1642$most[i] <- list(most)
}


  # transcript 17226 ####

df17226 <- people %>% filter(transcript == 17226)
# take out non-main speakers (hand-picked row positions)
df17226 <- df17226[-c(5, 12, 14:25, 36, 39:44, 62:68),]

# Turn-level overlap between every pair of profiles from different coders:
# 1 - (turns found in only one of the two profiles) / (total turns in both).
# Cells start as NA; same-coder pairs (including the diagonal) stay NA.
comp <- matrix(nrow = nrow(df17226), ncol = nrow(df17226))
for (i in seq_len(nrow(df17226))) {
  for (j in seq_len(nrow(df17226))) {
    if (df17226$coder[i] != df17226$coder[j]) {
      in_i_not_j <- sum(!(df17226$text[[i]] %in% df17226$text[[j]]))
      in_j_not_i <- sum(!(df17226$text[[j]] %in% df17226$text[[i]]))
      n <- length(df17226$text[[i]]) + length(df17226$text[[j]])
      comp[i, j] <- 1 - ((in_i_not_j + in_j_not_i) / n)
    }
  }
}

comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

# `text` is a list column, so count characters over the turns pasted with
# single spaces; nchar() on the list itself would count the deparsed form.
df17226$length <- vapply(df17226$text,
                         function(turns) nchar(paste(turns, collapse = " ")),
                         integer(1))
comp$rownum <- seq_len(nrow(df17226))

# dev  = mean overlap with the (up to) 5 closest other-coder profiles
# most = row numbers of those closest profiles
df17226$dev <- NA
df17226$most <- vector("list", nrow(df17226))
for (i in seq_len(nrow(df17226))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df17226$dev[i] <- mean(comp[most, paste0("V", i)], na.rm = TRUE)
  df17226$most[i] <- list(most)
}


  # transcript 19327 ####

df19327 <- people %>% filter(transcript == 19327)
# take out non-main speakers (hand-picked row positions)
df19327 <- df19327[-c(13:17, 30:35),]

# Turn-level overlap between every pair of profiles from different coders:
# 1 - (turns found in only one of the two profiles) / (total turns in both).
# Cells start as NA; same-coder pairs (including the diagonal) stay NA.
comp <- matrix(nrow = nrow(df19327), ncol = nrow(df19327))
for (i in seq_len(nrow(df19327))) {
  for (j in seq_len(nrow(df19327))) {
    if (df19327$coder[i] != df19327$coder[j]) {
      in_i_not_j <- sum(!(df19327$text[[i]] %in% df19327$text[[j]]))
      in_j_not_i <- sum(!(df19327$text[[j]] %in% df19327$text[[i]]))
      n <- length(df19327$text[[i]]) + length(df19327$text[[j]])
      comp[i, j] <- 1 - ((in_i_not_j + in_j_not_i) / n)
    }
  }
}

comp <- as.data.frame(comp)   # columns V1, V2, ... (one per profile)

# `text` is a list column, so count characters over the turns pasted with
# single spaces; nchar() on the list itself would count the deparsed form.
df19327$length <- vapply(df19327$text,
                         function(turns) nchar(paste(turns, collapse = " ")),
                         integer(1))
comp$rownum <- seq_len(nrow(df19327))

# dev  = mean overlap with the (up to) 5 closest other-coder profiles
# most = row numbers of those closest profiles
df19327$dev <- NA
df19327$most <- vector("list", nrow(df19327))
for (i in seq_len(nrow(df19327))) {
  most <- comp$rownum[order(comp[[paste0("V", i)]], decreasing = TRUE)][1:5]
  df19327$dev[i] <- mean(comp[most, paste0("V", i)], na.rm = TRUE)
  df19327$most[i] <- list(most)
}


## 5. combine results ####
# Stack the four per-transcript tables and keep only profiles whose dev is
# non-zero (rows where dev is NA/NaN are dropped by filter() as well).
comb <- bind_rows(df104314, df1642, df17226, df19327) %>%
  filter(dev != 0)

# average dev across the remaining profiles
mean(comb$dev)


## 6. repeat by speech turns - exclude turns with <=3 words ####
# Same list-of-turns layout as section 4, but only turns with more than three
# whitespace-separated words are kept. transcript/coder are constant within
# an id, so the first value is taken (deterministic, unlike sample()).
people <- docs %>%
  mutate(nwords = str_count(text, "\\S+")) %>%
  group_by(id) %>%
  summarize(text = list(text[nwords > 3]),
            transcript = transcript[1],
            coder = coder[1])
  #then rerun each transcript chunk under header 4
  # (manual step; `comb` is only rebuilt by the bind_rows() call under
  #  header 5, so rerun that too before reading the summaries below)


# unweighted and length-weighted average dev
mean(comb$dev)
weighted.mean(comb$dev, w = comb$length)

## 7. repeat by words- exclude turns with <=3 words ####

# Same pasted-text layout as the first pass, but built only from turns with
# more than three whitespace-separated words. transcript/coder are constant
# within an id, so the first value is taken (deterministic, unlike sample()).
people <- docs %>%
  mutate(nwords = str_count(text, "\\S+")) %>%
  group_by(id) %>%
  summarize(text = paste(text[nwords > 3], collapse = " "),
            transcript = transcript[1],
            coder = coder[1])
#rerun each chunk under header 2 (manual step)

comb <- bind_rows(df104314, df1642, df17226, df19327)

# keep profiles with a positive dev, then report the unweighted and
# length-weighted averages
comb <- comb %>% filter(dev > 0)
mean(comb$dev)
weighted.mean(comb$dev, w = comb$length)



### RESET for second set of checks ####
# Drop every object created by the first set of checks so the second set
# starts from a clean workspace.
rm(list = c("comb", "comp",
            "df104314", "df1642", "df17226", "df19327",
            "DFjury", "DFjury_separated", "docs", "people", "transcriptnames",
            "i", "in_i_not_j", "in_j_not_i", "j", "jury", "most", "n"))
########################################
## 1. read and recode data ####

# Working file: one row per juror (case_id) with the identifier each coder
# assigned ("Identifier 1" and "Identifier 2").
both_coders <- read_csv(here("Data", "icr_identified_jurors.csv"))


# Reshape to one row per coder within each jury: the result has two rows per
# jurynum (one per identifier column) and one column per juror (id1, id2, ...).

# Order rows by jury and case, then number the jurors 1..k within each jury.
data_sorted <- both_coders %>%
  arrange(jurynum, case_id) %>%
  group_by(jurynum) %>%
  mutate(jurornum = row_number()) %>%
  ungroup()

# First coder's identifiers, one wide row per jury
wide_id1 <- pivot_wider(data_sorted,
                        id_cols = jurynum,
                        names_from = jurornum,
                        values_from = "Identifier 1",
                        names_prefix = "id",
                        names_sort = TRUE)

# Second coder's identifiers, same layout
wide_id2 <- pivot_wider(data_sorted,
                        id_cols = jurynum,
                        names_from = jurornum,
                        values_from = "Identifier 2",
                        names_prefix = "id",
                        names_sort = TRUE)

# Stack both coders and bring each jury's two rows together
check_icr <- arrange(bind_rows(wide_id1, wide_id2), jurynum)

rm(data_sorted, wide_id1, wide_id2)


# Vector of the unique jurynum values in check_icr
jurynum <- unique(check_icr$jurynum)

# Create one object per jury, named icr_<jurynum>, holding both coders' rows
# (only the id* columns) for that jury. The icr_ prefix matters: the
# reliability step later collects these objects by that name pattern.
# Inside filter(), `jurynum` refers to the column and `i` to the loop value.
for (i in jurynum) {
  assign(
    x = paste0("icr_", i),
    value = select(filter(check_icr, jurynum == i), starts_with("id"))
  )
}

## Now add the first 5 ICR tests (completed earlier and sent to Elizabeth by email)
# Each object is a 2 x 6 matrix: one row per coder, one column per juror slot.
# NOTE(review): in icr_1004 the second coder lists 730 twice -- confirm
# against the emailed results that this is a real disagreement, not a typo.

icr_1001 <- rbind(c(1120, 1123, 1121, 0, 1122, 0),
                  c(1120, 1123, 1121, 0, 1122, 0))
icr_1002 <- rbind(c(1694, 1689, 1691, 1692, 1693, 1690),
                  c(1694, 1689, 1692, 1691, 1693, 1690))
icr_1003 <- rbind(c(1136, 1137, 1141, 1140, 1139, 1138),
                  c(1136, 1137, 1141, 1138, 1139, 1140))
icr_1004 <- rbind(c(731, 730, 732, 729, 733, 728),
                  c(731, 730, 732, 729, 730, 728))
icr_1005 <- rbind(c(195, 196, 194, 197, 198, 193),
                  c(195, 196, 194, 197, 198, 193))


# Names of every object in the current environment that starts with "icr_"
objects_icr <- ls(pattern = "^icr_")

## 2. calculate reliability ####

# Label for each object = its name without the "icr_" prefix
labels <- sub("^icr_", "", objects_icr)

# Krippendorff's alpha for each object, looked up by name
alpha_values <- vapply(
  objects_icr,
  function(object_name) krippalpha(get(object_name))$alpha,
  numeric(1),
  USE.NAMES = FALSE
)

# One row per jury, sorted by numeric label
alpha_data <- arrange(
  data.frame(label = as.numeric(labels), alpha = alpha_values),
  label
)

# Mean alpha across all tested juries
mean(alpha_data$alpha)


# Per-juror agreement: flag whether Identifier 1 and Identifier 2 are equal
# (the flag is NA when either identifier is missing).
coder_check <- mutate(
  both_coders,
  match = ifelse(`Identifier 1` == `Identifier 2`, "Match", "No Match")
)

# Share of rows that match: rows with an NA flag are not counted as matches
# but still count toward the total.
total_matches <- sum(coder_check$match == "Match", na.rm = TRUE)

total_rows <- nrow(coder_check)

total_matches / total_rows




